Sentiment Analysis on Movie Reviews

Using a Logistic Regression Model

Sentiment labels:

0 - negative
1 - somewhat negative
2 - neutral
3 - somewhat positive
4 - positive

Load Libraries



In [29]:

    
import nltk
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

Load & Read Datasets



In [8]:

    
# Both files are tab-separated; `sep` is the canonical spelling of `delimiter`.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')



In [10]:

    
# (rows, columns) of the training frame followed by the test frame.
(train.shape, test.shape)









    Out[10]:





((156060, 4), (66292, 3))



In [20]:

    
# Preview the first five training rows.
train.head(n=5)









    Out[20]:






  
    
      
      PhraseId
      SentenceId
      Phrase
      Sentiment
    
  
  
    
      0
      1
      1
      A series of escapades demonstrating the adage ...
      1
    
    
      1
      2
      1
      A series of escapades demonstrating the adage ...
      2
    
    
      2
      3
      1
      A series
      2
    
    
      3
      4
      1
      A
      2
    
    
      4
      5
      1
      series
      2



In [13]:

    
# Preview the first five test rows (same columns as train, minus Sentiment).
test.head(n=5)









    Out[13]:






  
    
      
      PhraseId
      SentenceId
      Phrase
    
  
  
    
      0
      156061
      8545
      An intermittently pleasing but mostly routine ...
    
    
      1
      156062
      8545
      An intermittently pleasing but mostly routine ...
    
    
      2
      156063
      8545
      An
    
    
      3
      156064
      8545
      intermittently pleasing but mostly routine effort
    
    
      4
      156065
      8545
      intermittently pleasing but mostly routine



In [15]:

    
# Distinct sentiment labels present in the training data, in order of first appearance.
train['Sentiment'].unique()









    Out[15]:





array([1, 2, 3, 4, 0])



In [21]:

    
# Print a summary of the training frame: columns, non-null counts, dtypes and memory usage.
train.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 6.0+ MB



In [22]:

    
# Number of training phrases per sentiment label, most frequent first.
train['Sentiment'].value_counts()









    Out[22]:





2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64



In [23]:

    
# Share of training phrases per sentiment label (count divided by the number of labelled rows).
train['Sentiment'].value_counts().div(train['Sentiment'].count())









    Out[23]:





2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

Train Classifier



In [24]:

    
# Model input is the raw phrase text; the target is the integer sentiment label.
X_train, y_train = train['Phrase'], train['Sentiment']



In [30]:

    
# Text classification pipeline: token counts -> TF-IDF weighting -> logistic regression.
# Pipeline.fit returns the fitted pipeline itself, so construction and fitting are chained.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
]).fit(X_train, y_train)



In [31]:

    
# Sanity check: score the fitted pipeline on the same phrases it was trained on.
# These are training phrases, not held-out data, so X_train is reused instead of
# binding them to a name that suggests a test set. The resulting score is a
# training accuracy and will overstate performance on unseen phrases.
predicted = text_clf.predict(X_train)



In [32]:

    
# Training accuracy: fraction of training phrases whose predicted label matches the true label.
print(np.mean(y_train == predicted))









    



0.668787645777



In [34]:

    
# Print a summary of the test frame: columns, non-null counts, dtypes and memory usage.
test.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 66292 entries, 0 to 66291
Data columns (total 3 columns):
PhraseId      66292 non-null int64
SentenceId    66292 non-null int64
Phrase        66292 non-null object
dtypes: int64(2), object(1)
memory usage: 2.0+ MB

Create Submission



In [35]:

    
# Predict a sentiment label for every phrase in the test set.
X_test = test['Phrase']
phraseIds = test['PhraseId']
predicted = text_clf.predict(X_test)

# Pair each PhraseId with its predicted label.
output = pd.DataFrame({"PhraseId": phraseIds, "Sentiment": predicted})

# Writing the file is left disabled; uncomment to produce the CSV.
# quoting=3 is csv.QUOTE_NONE (no field is ever quoted).
#output.to_csv( "submission_logistic_regression.csv", index=False, quoting=3 )

	PhraseId	SentenceId	Phrase	Sentiment
0	1	1	A series of escapades demonstrating the adage ...	1
1	2	1	A series of escapades demonstrating the adage ...	2
2	3	1	A series	2
3	4	1	A	2
4	5	1	series	2

	PhraseId	SentenceId	Phrase
0	156061	8545	An intermittently pleasing but mostly routine ...
1	156062	8545	An intermittently pleasing but mostly routine ...
2	156063	8545	An
3	156064	8545	intermittently pleasing but mostly routine effort
4	156065	8545	intermittently pleasing but mostly routine